* Settings
* Interpret this do-file under Stata 16 syntax and semantics, then run the
* project-wide settings script.
* The global $SSDIMed (project root) must already be defined before this file
* runs, because the path to the settings script itself depends on it.
* NOTE(review): the settings script is not visible from here; presumably it
* sets up paths and options used below -- confirm.
version 16
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"

* NOTES from one of the county population files from CDC Wonder
*   ---
*   Dataset: Bridged-Race Population Estimates 1990-2019
*   Query Parameters:
*   States: Alabama (01)
*   Yearly July 1st Estimates: 1990
*   Group By: County; Age; Gender
*   Show Totals: False
*   Show Zero Values: True
*   Data Table: County Level
*   ---
*   Help: See http://wonder.cdc.gov/wonder/help/bridged-race.html for more information.
*   ---
*   Query Date: Jun 8, 2021 10:58:36 AM
*   ---
*   Suggested Citation: United States Department of Health and Human Services (US DHHS), Centers for Disease Control and Prevention
*   (CDC), National Center for Health Statistics (NCHS), Bridged-Race Population Estimates, United States July 1st resident
*   population by state, county, age, sex, bridged-race, and Hispanic origin. Compiled from 1990-1999 bridged-race intercensal
*   population estimates (released by NCHS on 7/26/2004); revised bridged-race 2000-2009 intercensal population estimates (released
*   by NCHS on 10/26/2012); and bridged-race Vintage 2019 (2010-2019) postcensal population estimates (released by NCHS on
*   7/9/2020). Available on CDC WONDER Online Database. Accessed at http://wonder.cdc.gov/bridged-race-v2019.html on Jun 8, 2021
*   10:58:36 AM
*   ---
*   Footnotes:
*   1. Estimates for 1990-1999 are bridged-race intercensal population estimates of the July 1 resident population. Estimates for
*   2000-2009 are revised bridged-race intercensal estimates of the July 1 resident population. Estimates for 2010-2019 are
*   bridged-race Vintage 2019 postcensal estimates of the July 1 resident population. These estimates were prepared by the Census
*   Bureau in collaboration with NCHS.
*   ---
*   Caveats:
*   1. County geography changes over time. New counties are created and old counties are deleted or their boundaries are modified.
*   The county codes and names for years 1990-1999 are based on Census 2000 geography; those for year 2000 and later are based on
*   Census 2010 geography.
*   2. The U.S. Census Bureau annually releases unbridged population estimates for five-year age groups and race at the county level
*   (http://www.census.gov/popest/research/eval-estimates/eval-est2010.html). The Census Bureau does not release bridged-race or
*   unbridged estimates by single year of age at the county level due to concerns about the reliability of these estimates. However,
*   these estimates are provided to the National Center for Health Statistics to meet programmatic needs such as the creation of age
*   groupings that differ from the standard groupings used by the Census Bureau. Users of the single-year-of-age county-level
*   bridged race population estimates should carefully consider the limited reliability of these estimates.

* Load county population estimates, in separate files by state and year.
* Every state-year extract is read as all-string columns, validated, lightly
* cleaned, and saved to its own tempfile so the pieces can be appended later.
local state_fips_list 01 02 04 05 06 08 09 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 46 47 48 49 50 51 53 54 55 56 
quietly foreach state_fips of local state_fips_list {
  forvalues year = 1990/2019 {
    * To debug a single extract interactively, set the loop macros by hand:
    *   local state_fips 48
    *   local year 1990
    noisily display "State: `state_fips', Year: `year'"
    local raw_file "$SSDIMed/data/raw/census/popest/1990-2019/`year'/cdc-wonder-county-pop-`state_fips'_`year'.txt"
    import delimited "`raw_file'", asdouble stringcols(_all) clear

    * Rows with text in notes must be exactly the rows with no age value;
    * those are the data-notes rows, so only rows with empty notes are kept
    assert missing(age) == !missing(notes)
    keep if missing(notes)
    drop notes

    * Gender: check the label and the one-letter code agree, then keep the code
    assert inlist(gender, "Male", "Female")
    assert inlist(gendercode, "M", "F")
    assert (gender == "Male") == (gendercode == "M")
    drop gender
    rename gendercode gender

    * Balance check: the row count must equal counties x ages x genders
    glevelsof countycode
    local cty_count = r(J)
    glevelsof age
    local age_count = r(J)
    glevelsof gender
    local sex_count = r(J)
    assert `sex_count' == 2
    assert _N == `cty_count' * `age_count' * `sex_count'

    * Record the estimate year and drop the raw year column(s) if any exist
    generate year = `year'
    capture drop yearlyjuly1stestimates*

    tempfile county_pop_`state_fips'_`year'
    save `county_pop_`state_fips'_`year''
  }
}

* Combine into a single file
* Start from an empty dataset and stack every state-year tempfile saved above
clear
quietly foreach state_fips of local state_fips_list {
  foreach year of numlist 1990/2019 {
    append using `county_pop_`state_fips'_`year''
  }
}

* QC: observations
* County codes must be 5 characters long, and county-year-age-gender must
* uniquely identify a row of the stacked data
assert strlen(countycode) == 5
rename countycode fipscounty
gisid fipscounty age gender year
label variable fipscounty "County FIPS, Census 2000 geography for years 1990-1999, 2010 geography for 2000+"
label variable county "County name"

* Clean age values and labels
* Both the age text and its code must be populated on every row
assert !missing(age) & !missing(agecode)

* The open-ended top group is coded "85+"; store it as 85 so the code is numeric
assert (agecode == "85+") if (age == "85+ years")
replace agecode = "85" if agecode == "85+"
destring agecode, replace

* Each numeric code must map to a single age text before it is used as a label
bysort agecode: assert age == age[1]

* Numeric code becomes the age variable; the original text becomes its value label
rename (age agecode) (age_lbl age)
labmask age, values(age_lbl)
label list age
drop age_lbl
label variable age "Single year of age through age 84 years, age 85 and older"

* Clean population
* Convert the string counts to numbers; force turns non-numeric text into missing
destring population, generate(pop) force

* The only text allowed to become missing is the literal "Missing"
assert (population == "Missing") == missing(pop)
drop population
rename pop population

* No negative counts (Stata missing values sort above every number, so they pass)
assert population >= 0
label variable population "Population, July 1st Estimate, Bridged-Race Population Estimates 1990-2019"

* QC: balanced panel
* Count the distinct values of each panel dimension and accumulate their
* product; a balanced panel has exactly that many rows
local n_cells = 1
foreach dim in fipscounty age gender year {
  quietly glevelsof `dim'
  local n_`dim' = r(J)
  local n_cells = `n_cells' * `n_`dim''
}
assert `n_gender' == 2
assert _N == `n_cells'

* Note: the panel is balanced in rows, but the population value itself can be
* missing for counties whose geography changed.
* Example: Skagway Municipality (FIPS code = 02230) was created effective June 20, 2007;
* its population must be missing in exactly the years 1990-1999.
sort fipscounty age gender year
assert missing(population) == inrange(year, 1990, 1999) if fipscounty == "02230"
list if fipscounty == "02230" & age == 65

* Only one county has estimates that are partially complete within one of the
* two periods 1990-1999 and 2000-2019.
* Source: https://www.cdc.gov/nchs/data/nvss/bridged_race/County_Geography_Changes.pdf
*   Bedford City, Virginia (FIPS code = 51515). Effective July 1, 2013, Bedford city, Virginia (51515), formerly an independent city,
*   was added to Bedford County (51019). Beginning with the Vintage 2014 postcensal series, estimates for this county equivalent
*   no longer appear on the bridged-race population files.
* Its population must be missing in exactly the years 2010-2019.
assert missing(population) == inrange(year, 2010, 2019) if fipscounty == "51515"
list if fipscounty == "51515" & age == 65

* QC: for every other county, missingness is all-or-nothing within each period
* (period is 0 for 1990-1999 and 1 for 2000-2019)
generate byte flag = missing(population)
generate byte period = inrange(year, 2000, 2019)
bysort fipscounty period (flag): assert flag == flag[1] if fipscounty != "51515"
drop flag period


* Save datasets

* county-age-sex annual data (LONG format)
* Shrink storage types, then sort by the full key while confirming it is unique;
* the file is saved in that sort order
compress
bysort fipscounty age gender year: assert _N == 1

* Create the output folders; capture ignores the error if they already exist
capture mkdir "$SSDIMed/data/proc/census"
capture mkdir "$SSDIMed/data/proc/census/popest"
save "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byagegender_1990-2019.dta", replace


* county-age-sex annual data (WIDE format)
* Output: one row per county-year. Columns are the single-year-of-age counts
* and the aggregate totals, each split by gender through a _m or _f suffix.
use "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byagegender_1990-2019.dta", clear

rename population pop

* Aggregates are computed within county-year-GENDER. The gender reshape below
* uses the pop* stub, so every aggregate is split into _m and _f columns;
* totals pooled across genders would simply be written twice as identical
* _m and _f columns while reading as gender-specific.
* nansum (rather than total) leaves an aggregate missing, not 0, when every
* input is missing, e.g. county-years that predate a county's creation. This
* matches the (nansum) collapse used for the by-age file.
* Each age-band aggregate is first computed on the in-band rows only, then
* spread to all rows of the cell with min().
local cell fipscounty year gender
gegen double pop_all   = nansum(pop), by(`cell')
gegen double pop_18_60 = nansum(pop) if inrange(age, 18, 60), by(`cell')
gegen double pop_18_60 = min(pop_18_60), by(`cell') replace
gegen double pop_19_61 = nansum(pop) if inrange(age, 19, 61), by(`cell')
gegen double pop_19_61 = min(pop_19_61), by(`cell') replace
gegen double pop_18_65 = nansum(pop) if inrange(age, 18, 65), by(`cell')
gegen double pop_18_65 = min(pop_18_65), by(`cell') replace
label var pop_all "Population, all ages"
label var pop_18_60 "Population, ages 18-60"
label var pop_19_61 "Population, ages 19-61"
label var pop_18_65 "Population, ages 18-65"

* Step 1: ages go wide within county-year-gender
gisid fipscounty year gender age
greshape wide pop, by(fipscounty year gender) keys(age)

* Step 2: genders go wide within county-year; the key becomes the suffix _m or _f
gisid fipscounty year gender
replace gender = "_" + lower(gender)
greshape wide pop*, by(fipscounty year) keys(gender)

drop county
compress 
gisid fipscounty year
save "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byagegender_1990-2019_wide.dta", replace


* county-age annual data (LONG format)
* Sum the two genders within county-age-year. nansum keeps the total missing
* when both gender counts are missing.
use "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byagegender_1990-2019.dta", clear

* The county name must be constant within FIPS code, since it is used as a
* collapse key. bysort is used so this does not depend on the sort order
* stored in the file that was just loaded.
bysort fipscounty: assert county == county[1]
gcollapse (nansum) population, by(county fipscounty age year) labelformat(#sourcelabel#)
bysort fipscounty age year: assert _N == 1
compress

* Regression check: compare against the previously saved output before
* overwriting it. Any difference still stops the script. The check is skipped
* when no previous output exists, so a first run or clean checkout does not
* abort with a file-not-found error.
local byage_file "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byage_1990-2019.dta"
capture confirm file "`byage_file'"
if _rc == 0 {
  cf _all using "`byage_file'", all
}
save "`byage_file'", replace


* county-age annual data (WIDE format)
* Output: one row per county-year, with one column per single year of age
* plus the aggregate totals pop_all, pop_18_60, pop_19_61 and pop_18_65.
use "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byage_1990-2019.dta", clear

rename population pop

* nansum (rather than total) leaves an aggregate missing, not 0, when every
* input is missing, e.g. county-years that predate a county's creation. That
* keeps the aggregates consistent with the missing single-age columns.
* Each age-band aggregate is first computed on the in-band rows only, then
* spread to all rows of the county-year with min().
local cell fipscounty year
gegen double pop_all   = nansum(pop), by(`cell')
gegen double pop_18_60 = nansum(pop) if inrange(age, 18, 60), by(`cell')
gegen double pop_18_60 = min(pop_18_60), by(`cell') replace
gegen double pop_19_61 = nansum(pop) if inrange(age, 19, 61), by(`cell')
gegen double pop_19_61 = min(pop_19_61), by(`cell') replace
gegen double pop_18_65 = nansum(pop) if inrange(age, 18, 65), by(`cell')
gegen double pop_18_65 = min(pop_18_65), by(`cell') replace
label var pop_all "Population, all ages"
label var pop_18_60 "Population, ages 18-60"
label var pop_19_61 "Population, ages 19-61"
label var pop_18_65 "Population, ages 18-65"

* Ages go wide within county-year
greshape wide pop, by(fipscounty year) keys(age)

drop county
compress 
gisid fipscounty year
save "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byage_1990-2019_wide.dta", replace



** EOF
